//
//  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
//  Distributed under the Boost Software License, Version 1.0. (See
//  accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef NOWIDE_UTF_HPP_INCLUDED
#    define NOWIDE_UTF_HPP_INCLUDED

#    include <nowide/config.hpp>

// Provide the fixed-width integer names (uint32_t, uint16_t, uint8_t) used by the UTF traits.
//
// - When NOWIDE_MSVC is NOT defined, the names are declared locally inside nowide::utf
//   as typedefs of the built-in unsigned types.
// - Otherwise <stdint.h> is included and the global names are used.
//
// NOTE(review): the local typedefs assume `unsigned` is 32 bits and `unsigned short`
// is 16 bits on every supported target - confirm.
// NOTE(review): the usual workaround is "old MSVC lacks <stdint.h>", which would be the
// opposite condition - confirm the intended direction against nowide/config.hpp.
#    ifndef NOWIDE_MSVC
namespace nowide
{
    namespace utf
    {
        typedef unsigned       uint32_t;
        typedef unsigned short uint16_t;
        typedef unsigned char  uint8_t;
    }    // namespace utf
}    // namespace nowide
#    else
#        include <stdint.h>
#    endif

namespace nowide
{
    ///
    /// \brief Namespace that holds basic operations on UTF encoded sequences
    ///
    /// All functions defined in this namespace do not require linking with Boost.Locale library
    ///
    namespace utf
    {
/// \cond INTERNAL
// Branch hints used on the hot paths of the traits below.
//
// NOWIDE_LIKELY(x)   - the condition x is expected to be true
// NOWIDE_UNLIKELY(x) - the condition x is expected to be false
//
// When __GNUC__ is defined they expand to __builtin_expect; otherwise they expand
// to the bare expression, so they never change the result of the condition.
// They are only meant to wrap boolean conditions (__builtin_expect compares the
// value against the literal 1 or 0).
#    ifdef __GNUC__
#        define NOWIDE_LIKELY(x) __builtin_expect((x), 1)
#        define NOWIDE_UNLIKELY(x) __builtin_expect((x), 0)
#    else
#        define NOWIDE_LIKELY(x) (x)
#        define NOWIDE_UNLIKELY(x) (x)
#    endif
        /// \endcond

        ///
        /// \brief The integral type that can hold a Unicode code point
        ///
        /// It is also wide enough to carry the two sentinel values \ref illegal and
        /// \ref incomplete, which lie above the largest value accepted by
        /// is_valid_codepoint() (0x10FFFF).
        ///
        typedef uint32_t code_point;

        ///
        /// \brief Special constant that defines illegal code point
        ///
        /// Returned by the decode() functions when the input is not a valid
        /// encoding of a code point. It is never a valid code point itself.
        ///
        static const code_point illegal = 0xFFFFFFFFu;

        ///
        /// \brief Special constant that defines incomplete code point
        ///
        /// Returned by the decode() functions when the input range ends before
        /// the sequence is complete. It is never a valid code point itself.
        ///
        static const code_point incomplete = 0xFFFFFFFEu;

        ///
        /// \brief Checks whether \a v is a valid Unicode code point
        ///
        /// A value is valid when it does not exceed 0x10FFFF and does not fall
        /// into the surrogate range [0xD800, 0xDFFF].
        ///
        inline bool is_valid_codepoint(code_point v)
        {
            const bool in_unicode_range = v <= 0x10FFFF;
            const bool is_surrogate = v >= 0xD800 && v <= 0xDFFF;    // surrogates
            return in_unicode_range && !is_surrogate;
        }

#    ifdef NOWIDE_DOXYGEN
        ///
        /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
        ///
        /// The encoding is chosen by the size of \a CharType: 1 byte selects UTF-8,
        /// 2 bytes UTF-16 and 4 bytes UTF-32.
        ///
        template<typename CharType, int size = sizeof(CharType)>
        struct utf_traits
        {
            ///
            /// The type of the character
            ///
            typedef CharType char_type;
            ///
            /// Read one code point from the range [p,e) and return it.
            ///
            /// - If the range ends before the sequence is complete (this includes
            ///   an empty range) returns \ref incomplete,
            /// - If an illegal sequence is detected returns \ref illegal
            ///
            /// Requirements
            ///
            /// - Iterator is valid input iterator
            ///
            /// Postconditions
            ///
            /// - p has been advanced past every code unit that was read, i.e. it
            ///   points one past the last consumed code unit. This also holds when
            ///   \ref illegal or \ref incomplete is returned.
            ///
            template<typename Iterator>
            static code_point decode(Iterator& p, Iterator e);

            ///
            /// Maximal width of valid sequence in the code units:
            ///
            /// - UTF-8  - 4
            /// - UTF-16 - 2
            /// - UTF-32 - 1
            ///
            static const int max_width;
            ///
            /// The width of specific code point in the code units.
            ///
            /// Requirement: value is a valid Unicode code point
            /// (the value is not validated by this function)
            ///
            /// Returns value in range [1..max_width]
            ///
            static int width(code_point value);

            ///
            /// Get the size of the trail part of variable length encoded sequence,
            /// i.e. the number of code units that must follow the lead unit \a c.
            ///
            /// Returns -1 if \a c is not a valid lead character
            ///
            static int trail_length(char_type c);
            ///
            /// Returns true if c is trail code unit, always false for UTF-32
            ///
            static bool is_trail(char_type c);
            ///
            /// Returns true if c is lead code unit (that is, not a trail code unit),
            /// always true for UTF-32
            ///
            static bool is_lead(char_type c);

            ///
            /// Convert valid Unicode code point \a value to the UTF sequence.
            ///
            /// Requirements:
            ///
            /// - \a value is valid code point (it is not validated by this function)
            /// - \a out is an output iterator should be able to accept at least width(value) units
            ///
            /// Returns the iterator past the last written code unit.
            ///
            template<typename Iterator>
            static Iterator encode(code_point value, Iterator out);
            ///
            /// Decodes valid UTF sequence that is pointed by p into code point.
            ///
            /// No validation and no end-of-range check is performed:
            /// if the sequence is invalid or points to end the behavior is undefined
            ///
            /// On return p points one past the last code unit of the sequence.
            ///
            template<typename Iterator>
            static code_point decode_valid(Iterator& p);
        };

#    else

        // Primary template: declared only, never defined. The implementation is
        // picked through the partial specializations on `size` (1, 2 or 4 bytes);
        // any other character size leaves utf_traits an incomplete type.
        template<typename CharType, int size = sizeof(CharType)>
        struct utf_traits;

        ///
        /// \brief UTF-8 flavour of utf_traits, selected for one byte wide character types
        ///
        template<typename CharType>
        struct utf_traits<CharType, 1>
        {
            typedef CharType char_type;

            /// A UTF-8 sequence is at most four code units long
            static const int max_width = 4;

            ///
            /// Number of continuation bytes announced by the lead byte \a ci:
            /// 0 for ASCII, 1..3 for multi-byte leads and -1 when \a ci cannot start
            /// a sequence (bytes 0x80..0xC1 and everything above 0xF4)
            ///
            static int trail_length(char_type ci)
            {
                const unsigned char byte = ci;
                if (byte <= 0x7F) return 0;
                if (NOWIDE_UNLIKELY(byte <= 0xC1)) return -1;
                if (byte <= 0xDF) return 1;
                if (byte <= 0xEF) return 2;
                if (NOWIDE_LIKELY(byte <= 0xF4)) return 3;
                return -1;
            }

            ///
            /// Number of code units needed to encode \a value (1..4).
            /// The value itself is not validated.
            ///
            static int width(code_point value)
            {
                if (value < 0x80) return 1;
                if (value < 0x800) return 2;
                if (NOWIDE_LIKELY(value < 0x10000)) return 3;
                return 4;
            }

            ///
            /// True for continuation bytes, i.e. bytes of the form 10xxxxxx
            ///
            static bool is_trail(char_type ci)
            {
                const unsigned char byte = ci;
                return byte >= 0x80 && byte <= 0xBF;
            }

            ///
            /// True for every byte that is not a continuation byte
            ///
            static bool is_lead(char_type ci)
            {
                return !is_trail(ci);
            }

            ///
            /// Reads and validates one code point from [p,e).
            ///
            /// Returns \ref incomplete when the range ends too early and \ref illegal
            /// for a bad lead byte, a bad continuation byte, a surrogate or out of
            /// range value, or a non-shortest encoding. \a p is left past every byte
            /// that was read.
            ///
            template<typename Iterator>
            static code_point decode(Iterator& p, Iterator e)
            {
                if (NOWIDE_UNLIKELY(p == e)) return incomplete;

                const unsigned char first = *p++;

                // The lead byte alone decides how many bytes have to follow
                const int pending = trail_length(first);
                if (NOWIDE_UNLIKELY(pending < 0)) return illegal;

                // Only ASCII has no trail; this is also the fast path for ASCII text
                if (pending == 0) return first;

                // Payload bits of the lead byte: 5, 4 or 3 bits for 1, 2 or 3 trail bytes
                code_point result = first & (0x3F >> pending);

                // Every continuation byte contributes six more bits
                for (int left = pending; left != 0; --left) {
                    if (NOWIDE_UNLIKELY(p == e)) return incomplete;
                    const unsigned char next = *p++;
                    if (!is_trail(next)) return illegal;
                    result = (result << 6) | (next & 0x3F);
                }

                // Reject surrogates and values beyond the Unicode range
                if (NOWIDE_UNLIKELY(!is_valid_codepoint(result))) return illegal;

                // Reject encodings that are longer than necessary
                if (NOWIDE_UNLIKELY(width(result) != pending + 1)) return illegal;

                return result;
            }

            ///
            /// Decodes one code point from a sequence that is known to be valid.
            /// Nothing is checked; \a p is left past the last byte of the sequence.
            ///
            template<typename Iterator>
            static code_point decode_valid(Iterator& p)
            {
                const unsigned char first = *p++;
                if (first < 0xC0) return first;

                int pending = 3;
                if (first < 0xE0)
                    pending = 1;
                else if (NOWIDE_LIKELY(first < 0xF0))    // non-BMP is rare
                    pending = 2;

                code_point result = first & (0x3F >> pending);
                for (int left = pending; left != 0; --left)
                    result = (result << 6) | (static_cast<unsigned char>(*p++) & 0x3F);

                return result;
            }

            ///
            /// Writes the UTF-8 encoding of \a value to \a out and returns the
            /// iterator past the last written code unit. \a value is not validated.
            ///
            template<typename Iterator>
            static Iterator encode(code_point value, Iterator out)
            {
                if (value < 0x80) {
                    // one byte: plain ASCII
                    *out++ = static_cast<char_type>(value);
                    return out;
                }
                if (value < 0x800) {
                    // two bytes: 110xxxxx 10xxxxxx
                    *out++ = static_cast<char_type>(0xC0 | (value >> 6));
                    *out++ = static_cast<char_type>(0x80 | (value & 0x3F));
                    return out;
                }
                if (NOWIDE_LIKELY(value < 0x10000)) {
                    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                    *out++ = static_cast<char_type>(0xE0 | (value >> 12));
                    *out++ = static_cast<char_type>(0x80 | ((value >> 6) & 0x3F));
                    *out++ = static_cast<char_type>(0x80 | (value & 0x3F));
                    return out;
                }
                // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                *out++ = static_cast<char_type>(0xF0 | (value >> 18));
                *out++ = static_cast<char_type>(0x80 | ((value >> 12) & 0x3F));
                *out++ = static_cast<char_type>(0x80 | ((value >> 6) & 0x3F));
                *out++ = static_cast<char_type>(0x80 | (value & 0x3F));
                return out;
            }
        };    // utf8

        template<typename CharType>
        struct utf_traits<CharType, 2>
        {
            typedef CharType char_type;

            // See RFC 2781
            static bool is_first_surrogate(uint16_t x)
            {
                return 0xD800 <= x && x <= 0xDBFF;
            }
            static bool is_second_surrogate(uint16_t x)
            {
                return 0xDC00 <= x && x <= 0xDFFF;
            }
            static code_point combine_surrogate(uint16_t w1, uint16_t w2)
            {
                return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
            }
            static int trail_length(char_type c)
            {
                if (is_first_surrogate(c)) return 1;
                if (is_second_surrogate(c)) return -1;
                return 0;
            }
            ///
            /// Returns true if c is trail code unit, always false for UTF-32
            ///
            static bool is_trail(char_type c)
            {
                return is_second_surrogate(c);
            }
            ///
            /// Returns true if c is lead code unit, always true of UTF-32
            ///
            static bool is_lead(char_type c)
            {
                return !is_second_surrogate(c);
            }

            template<typename It>
            static code_point decode(It& current, It last)
            {
                if (NOWIDE_UNLIKELY(current == last)) return incomplete;
                uint16_t w1 = *current++;
                if (NOWIDE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                    return w1;
                }
                if (w1 > 0xDBFF) return illegal;
                if (current == last) return incomplete;
                uint16_t w2 = *current++;
                if (w2 < 0xDC00 || 0xDFFF < w2) return illegal;
                return combine_surrogate(w1, w2);
            }
            template<typename It>
            static code_point decode_valid(It& current)
            {
                uint16_t w1 = *current++;
                if (NOWIDE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
                    return w1;
                }
                uint16_t w2 = *current++;
                return combine_surrogate(w1, w2);
            }

            static const int max_width = 2;
            static int       width(code_point u)
            {
                return u >= 0x10000 ? 2 : 1;
            }
            template<typename It>
            static It encode(code_point u, It out)
            {
                if (NOWIDE_LIKELY(u <= 0xFFFF)) {
                    *out++ = static_cast<char_type>(u);
                }
                else {
                    u -= 0x10000;
                    *out++ = static_cast<char_type>(0xD800 | (u >> 10));
                    *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
                }
                return out;
            }
        };    // utf16;

        ///
        /// \brief UTF-32 flavour of utf_traits, selected for four byte wide character types
        ///
        /// Every code unit is a complete code point, so there are no trail units.
        ///
        template<typename CharType>
        struct utf_traits<CharType, 4>
        {
            typedef CharType char_type;

            /// A UTF-32 sequence is always exactly one code unit long
            static const int max_width = 1;

            ///
            /// 0 when \a c is a valid code point, -1 otherwise
            ///
            static int trail_length(char_type c)
            {
                return is_valid_codepoint(c) ? 0 : -1;
            }
            /// There are no trail units in UTF-32
            static bool is_trail(char_type /*c*/)
            {
                return false;
            }
            /// Every unit is a lead unit in UTF-32
            static bool is_lead(char_type /*c*/)
            {
                return true;
            }
            /// Always 1, whatever the code point
            static int width(code_point /*u*/)
            {
                return 1;
            }

            ///
            /// Reads one code point from [current,last).
            ///
            /// Returns \ref incomplete for an empty range and \ref illegal when the
            /// unit is not a valid code point. \a current is left past the unit read.
            ///
            template<typename It>
            static code_point decode(It& current, It last)
            {
                if (NOWIDE_UNLIKELY(current == last)) return incomplete;
                const code_point unit = *current++;
                if (NOWIDE_UNLIKELY(!is_valid_codepoint(unit))) return illegal;
                return unit;
            }

            ///
            /// Reads one unit without any checks and returns it as the code point
            ///
            template<typename It>
            static code_point decode_valid(It& current)
            {
                return *current++;
            }

            ///
            /// Writes \a u as a single code unit and returns the iterator past it.
            /// \a u is not validated.
            ///
            template<typename It>
            static It encode(code_point u, It out)
            {
                *out++ = static_cast<char_type>(u);
                return out;
            }

        };    // utf32

#    endif

    }    // namespace utf
}    // namespace nowide

#endif

// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
